In [1]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn import datasets
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures
import random
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
In [2]:
# teens' sales ratio, 20s' sales ratio, average months in business, store count, total sales
data=pd.read_csv("plus_living.csv")
# Rescale to thousands of people (presumably for readability — TODO confirm units)
data["total_living_people"]=data["total_living_people"] /1000
# Row-position key used later to merge cluster labels back onto this frame.
# NOTE(review): 31547 appears to be the row count of plus_living.csv — confirm.
data["idx"]=np.arange(31547)
del data['Unnamed: 0']    # artifact index column from the CSV export
del data['cm_code_name']  # drop the Korean name column (non-numeric text breaks scaling)

# Feature subset used for scaling and clustering: age-band sales ratios + similar-store count.
st_data=pd.DataFrame(data,columns=["10's_sales_rate","20's_sales_rate","30's_sales_rate","40's_sales_rate","50's_sales_rate","60's_sales_rate","simillar_store_number"])
# Standardize one column of the module-level `st_data` in place.
# (Original comment noted: errors here came from Korean text columns; every
# non-Korean/numeric column scales fine.)
def standardScaler(header_list):
    """Scale ``st_data[header_list]`` to zero mean / unit variance, in place.

    Parameters
    ----------
    header_list : str
        Name of a numeric column in the module-level DataFrame ``st_data``.

    Side effects: overwrites ``st_data[header_list]`` with the scaled values
    and prints the fitted scaler (output parity with the original version).

    Fixes vs. original: the local scaler no longer shadows this function's own
    name, and the dummy 'idx'/'B' padding column (with its hard-coded row
    count of 31547) is gone — StandardScaler scales each column
    independently, so the padding never influenced the result.
    """
    column_values = st_data[[header_list]].values  # 2-D (n, 1) array, as fit() expects
    scaler = StandardScaler()
    print(scaler.fit(column_values))  # keep the printed repr the original emitted
    st_data[header_list] = scaler.transform(column_values).ravel()
    
# Standardize only the selected feature columns (age-band sales ratios and the
# similar-store count); all other columns stay on their raw scale.
scale_columns = {"10's_sales_rate", "20's_sales_rate", "30's_sales_rate",
                 "40's_sales_rate", "50's_sales_rate", "60's_sales_rate",
                 "simillar_store_number"}
for column in list(data):
    if column in scale_columns:
        standardScaler(column)

# Clustering: group rows into 5 clusters on the scaled features.
# NOTE(review): KMeans has no random_state, so labels differ between runs;
# left unchanged to match the outputs recorded below.
test_kmeans = pd.DataFrame(st_data, columns=["10's_sales_rate", "20's_sales_rate",
                                             "30's_sales_rate", "40's_sales_rate",
                                             "50's_sales_rate", "60's_sales_rate",
                                             "simillar_store_number"])
data_points = test_kmeans.values
kmeans = KMeans(n_clusters=5).fit(data_points)
test_kmeans['cluster_id'] = kmeans.labels_
df = test_kmeans.copy()  # `df` (scaled features + cluster_id) feeds the classifier later
test_kmeans["idx"] = np.arange(len(test_kmeans))  # was hard-coded 31547; derive from data

# Join the cluster id back onto the full (unscaled) data by row position.
test_kmeans = pd.DataFrame(test_kmeans, columns=["cluster_id", "idx"])
new = pd.merge(data, test_kmeans, on=["idx"])
del new["idx"]

# One sub-frame per cluster (long alias names kept for backward compatibility).
test0 = rateof_1020_business_stnum0 = new[new['cluster_id'] == 0]
test1 = rateof_1020_business_stnum1 = new[new['cluster_id'] == 1]
test2 = rateof_1020_business_stnum2 = new[new['cluster_id'] == 2]
test3 = rateof_1020_business_stnum3 = new[new['cluster_id'] == 3]
test4 = rateof_1020_business_stnum4 = new[new['cluster_id'] == 4]
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
StandardScaler(copy=True, with_mean=True, with_std=True)
In [3]:
# Column-wise means of each cluster sub-frame, one column per cluster.
Mean = pd.DataFrame({
    'test{}'.format(i): frame.mean()
    for i, frame in enumerate((test0, test1, test2, test3, test4))
})

# Transpose so clusters become rows, then keep only the three index metrics
# (activity / growth / safety), transposed back for plotting.
Meandf = Mean.T
A = pd.DataFrame(Meandf, columns=["act_jipyo_value", 'growth_jipyo_value', 'safety_jipyo_value'])
A = A.T
Mean
Out[3]:
test0 test1 test2 test3 test4
dates 2.018100e+05 2.018100e+05 2.018100e+05 2.018100e+05 2.018100e+05
cm_code 9.197671e+02 8.845099e+02 8.810994e+02 8.926687e+02 9.359703e+02
service_code 1.836945e+05 2.233243e+05 1.795623e+05 2.087538e+05 2.057022e+05
over_jisu_value 4.909643e+01 4.884291e+01 4.869486e+01 4.927236e+01 4.850829e+01
act_jipyo_value 5.227208e+01 5.097126e+01 5.110028e+01 5.107899e+01 5.167840e+01
growth_jipyo_value 5.294131e+01 5.241492e+01 5.271026e+01 5.422010e+01 5.685692e+01
safety_jipyo_value 5.393619e+01 5.513603e+01 5.451242e+01 5.514800e+01 5.424042e+01
business_month_avg 5.119408e+01 9.833280e+01 6.496710e+01 6.430312e+01 5.377345e+01
simillar_store_number 3.823245e+00 3.076026e+00 2.589270e+00 3.695828e+00 2.566832e+00
total_moving_people 6.317399e+04 5.215152e+04 5.148100e+04 5.040345e+04 5.558253e+04
10's_moving_people 3.838072e+03 2.908729e+03 2.741772e+03 3.020160e+03 4.326411e+03
20's_moving_people 2.103191e+04 1.233436e+04 1.264564e+04 1.234528e+04 1.620179e+04
30's_moving_people 1.231757e+04 1.032455e+04 1.090367e+04 1.022341e+04 1.020457e+04
40's_moving_people 9.305073e+03 8.956646e+03 8.910369e+03 8.671072e+03 8.714973e+03
50's_moving_people 9.020121e+03 9.399904e+03 8.744852e+03 8.634673e+03 8.653693e+03
60's_moving_people 7.661289e+03 8.227380e+03 7.534758e+03 7.508914e+03 7.481103e+03
c_month_sales_amount 6.552590e+07 6.943711e+07 4.901504e+07 3.245893e+07 2.323462e+07
10's_sales_rate 3.085563e+00 5.557490e-01 1.171325e+00 3.153503e-01 2.167240e+01
20's_sales_rate 4.449788e+01 7.160171e+00 1.589372e+01 1.969259e+00 3.204035e+01
30's_sales_rate 2.145675e+01 1.168973e+01 3.488119e+01 8.835966e+00 1.162611e+01
40's_sales_rate 1.486226e+01 2.046656e+01 2.527697e+01 7.510579e+01 1.692624e+01
50's_sales_rate 1.117632e+01 3.249929e+01 1.504049e+01 1.095512e+01 1.260309e+01
60's_sales_rate 4.921020e+00 2.762921e+01 7.736569e+00 2.819158e+00 5.130569e+00
10's_sales_amount 1.425525e+06 1.536703e+05 3.119693e+05 1.167216e+05 2.304672e+06
20's_sales_amount 2.169803e+07 3.171242e+06 5.803324e+06 3.733779e+05 7.370081e+06
30's_sales_amount 1.428286e+07 7.809284e+06 1.179426e+07 2.743119e+06 2.845062e+06
40's_sales_amount 9.794123e+06 1.394989e+07 1.049252e+07 2.166365e+07 4.133569e+06
50's_sales_amount 7.139031e+06 1.794312e+07 7.142246e+06 5.249386e+06 2.798482e+06
60's_sales_amount 3.365857e+06 1.866219e+07 4.326932e+06 8.020389e+05 1.152618e+06
store_number 3.592464e+00 2.884080e+00 2.779590e+00 2.729445e+00 2.066832e+00
man 3.168699e+04 2.636120e+04 2.607469e+04 2.536836e+04 2.790943e+04
woman 3.148700e+04 2.579031e+04 2.540629e+04 2.503505e+04 2.767305e+04
mon 8.981198e+03 7.392351e+03 7.320350e+03 7.154812e+03 8.004191e+03
tue 8.979360e+03 7.364631e+03 7.289459e+03 7.129785e+03 7.986203e+03
wed 9.048385e+03 7.421137e+03 7.351842e+03 7.189738e+03 7.993401e+03
thu 9.059863e+03 7.397702e+03 7.354633e+03 7.174871e+03 8.009947e+03
fri 9.140334e+03 7.541894e+03 7.505588e+03 7.297655e+03 8.017750e+03
sat 9.221934e+03 7.611253e+03 7.471176e+03 7.329200e+03 7.909814e+03
sun 8.742926e+03 7.422606e+03 7.187982e+03 7.127436e+03 7.661219e+03
weekend 1.796486e+04 1.503386e+04 1.465916e+04 1.445664e+04 1.557103e+04
total_living_people 2.699044e+01 3.006744e+01 2.937495e+01 3.107729e+01 2.920171e+01
10's_living_people 3.493273e+03 4.220969e+03 4.170942e+03 4.589119e+03 4.211127e+03
20's_living_people 4.639316e+03 4.490489e+03 4.405359e+03 4.617123e+03 4.582901e+03
30's_living_people 4.579942e+03 4.790007e+03 4.869595e+03 4.966578e+03 4.588687e+03
40's_living_people 4.136937e+03 4.767118e+03 4.697735e+03 5.038073e+03 4.680908e+03
50's_living_people 4.174471e+03 4.912623e+03 4.693868e+03 4.974403e+03 4.665600e+03
60's_living_people 5.966497e+03 6.886230e+03 6.537451e+03 6.891995e+03 6.472486e+03
cluster_id 0.000000e+00 1.000000e+00 2.000000e+00 3.000000e+00 4.000000e+00
In [4]:
# Line plot of the activity/growth/safety means computed in the previous cell.
A.plot(figsize=(12, 4), legend=True, fontsize=15)

# Recompute the per-cluster means (identical to the previous cell's Mean).
Mean = pd.DataFrame({
    'test{}'.format(i): frame.mean()
    for i, frame in enumerate((test0, test1, test2, test3, test4))
})

Meandf = Mean.T
rate_columns = ["10's_sales_rate", "20's_sales_rate", "30's_sales_rate",
                "40's_sales_rate", "50's_sales_rate", "60's_sales_rate"]
# Age-band sales-rate means per cluster, transposed so ages run along the x axis.
A2 = pd.DataFrame(Meandf, columns=rate_columns)
A2 = A2.T
A2.plot(figsize=(12, 4), legend=True, fontsize=15)
Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x28982ca6c88>
In [29]:
from matplotlib import font_manager, rc

# Register a Korean-capable font so service names render.
# NOTE(review): absolute Windows-only path — not portable to other machines.
font_name = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf").get_name()
rc('font', family=font_name)

# Attach human-readable service-code names to every cluster frame.
code_name = pd.read_csv("code_name.csv")
test0, test1, test2, test3, test4 = [
    pd.merge(frame, code_name, on="service_code")
    for frame in (test0, test1, test2, test3, test4)
]

plt.rcParams.update({'font.size': 40})
fig = plt.figure(figsize=(50, 50))

def _plot_top5_services(position, frame):
    """Pie chart of the five most common service codes in one cluster frame."""
    fig.add_subplot(position)
    frame.service_code_name.value_counts().head(5).plot.pie(autopct='%.2f%%')
    plt.title("service_code")
    plt.axis('equal')

# One panel per cluster; replaces five copy-pasted blocks of identical code.
for position, frame in zip(range(321, 326), (test0, test1, test2, test3, test4)):
    _plot_top5_services(position, frame)
plt.show()
In [39]:
import seaborn as sns

plt.rcParams.update({'font.size': 18})
# NOTE: the original created an unused 50x50 figure here; sns.lmplot always
# draws on its own fresh figure, so that figure only produced the blank
# "<Figure size 3600x3600 with 0 Axes>" artifact — removed.

# One scatter of sales rate vs. cluster id per age band; replaces six
# copy-pasted lmplot blocks.
for decade in ('10', '20', '30', '40', '50', '60'):
    column = "{}'s_sales_rate".format(decade)
    sns.lmplot('cluster_id', column, data=new, fit_reg=False,
               scatter_kws={"s": 1}, hue="cluster_id")
    plt.title(column)
Out[39]:
Text(0.5, 1, "60's_sales_rate")
<Figure size 3600x3600 with 0 Axes>
In [46]:
from sklearn.decomposition import PCA
# Allow minus signs to render with the Korean font set earlier.
plt.rcParams['axes.unicode_minus'] = False
pca = PCA(n_components=3)
X_scaled = st_data
pca.fit(X_scaled)
plt.rcParams.update({'font.size': 10})
# Project the 7 scaled features onto the first three principal components
# (the original comment said "two", but n_components=3 above).
X_pca = pca.transform(X_scaled)
X_pca = pd.DataFrame(X_pca)

print("원본 데이터 형태: {}".format(str(X_scaled.shape)))
print("축소된 데이터 형태: {}".format(str(X_pca.shape)))
# Pull the cluster labels from `new` to colour the projected points.
# NOTE(review): assumes `new` and `st_data` share the same row order — confirm.
clusterid=pd.DataFrame(new,columns=["cluster_id"])
X_pca['cluster_id'] = clusterid

fig = plt.figure(figsize = (15,15))
ax = fig.add_subplot( projection='3d')

# The three principal-component axes.
x = X_pca[0]
y = X_pca[1]
z = X_pca[2]

# Colour each projected point by its k-means cluster id.
ax.scatter(x,y,z, c=X_pca['cluster_id'], marker='o')

ax.set_xlabel('x axis')
ax.set_ylabel('y axis')
ax.set_zlabel('z axis')

plt.show()
원본 데이터 형태: (31547, 7)
축소된 데이터 형태: (31547, 3)
In [4]:
# NOTE(review): pandas_profiling is deprecated upstream (renamed ydata-profiling).
import pandas_profiling as pp
# Exploratory profiling report for cluster 0's rows.
pp.ProfileReport(test0)
Out[4]:

In [5]:
# Exploratory profiling report for cluster 1's rows.
pp.ProfileReport(test1)
Out[5]:

In [6]:
# Exploratory profiling report for cluster 2's rows.
pp.ProfileReport(test2)
Out[6]:

In [7]:
# Exploratory profiling report for cluster 3's rows.
pp.ProfileReport(test3)
Out[7]:

In [8]:
# Exploratory profiling report for cluster 4's rows.
pp.ProfileReport(test4)
Out[8]:

In [11]:
# Peek at the scaled feature matrix (+ cluster_id) that will train the classifier.
df.head()
Out[11]:
10's_sales_rate 20's_sales_rate 30's_sales_rate 40's_sales_rate 50's_sales_rate 60's_sales_rate simillar_store_number cluster_id
0 -0.417851 -1.016262 -1.326808 -1.352863 0.713260 3.609736 -0.552477 1
1 1.415834 2.611103 -0.381812 -1.352863 -0.788093 -0.607859 0.500985 0
2 -0.211562 -0.176182 -0.259085 -0.234474 0.242879 0.582383 -0.552477 1
3 1.805492 -0.293015 0.501822 -0.132313 -0.382148 -0.148581 -0.552477 2
4 -0.096956 1.270313 0.348413 -0.438795 -0.697883 -0.588453 1.291082 0
In [12]:
# Hold out 30% of the clustered rows; the k-means label is the prediction target.
train_part, test_part = train_test_split(df, test_size=0.3, random_state=0)

y_train = train_part['cluster_id']
y_test = test_part['cluster_id']
X_train = train_part.drop(columns=['cluster_id'])
X_test = test_part.drop(columns=['cluster_id'])

X_train.shape, X_test.shape
Out[12]:
((22082, 7), (9465, 7))
In [13]:
# Train a random forest to reproduce the k-means labels from the scaled features.
# FIX: max_features='auto' was deprecated and removed in scikit-learn 1.3;
# 'sqrt' is the value 'auto' mapped to for classifiers, so behaviour is unchanged.
forest = RandomForestClassifier(n_estimators=600, max_depth=12, random_state=42,
                                criterion='entropy', max_features='sqrt')
forest.fit(X_train, y_train)
pred = forest.predict(X_test)
print("훈련 세트 정확도 : {:.3f}".format(forest.score(X_train, y_train)))
print("테스트 세트 정확도 : {:.3f}".format(forest.score(X_test, y_test)))
print("accuracy : ", accuracy_score(y_test, pred))
훈련 세트 정확도 : 0.999
테스트 세트 정확도 : 0.978
accuracy :  0.9782356048600106
In [14]:
# Visualize one tree (index 5) from the fitted forest.
estimator = forest.estimators_[5]

from sklearn.tree import export_graphviz
# FIX: class_names was the string 'cluster_id', which export_graphviz treats as
# a sequence of characters ('c', 'l', 'u', ...). Use the actual class labels.
export_graphviz(estimator, out_file='tree.dot',
                feature_names=X_train.columns,
                class_names=[str(c) for c in forest.classes_],
                rounded=True, proportion=False,
                precision=2, filled=True)

from IPython.display import Image
# NOTE(review): the export writes 'tree.dot' but this loads 'tree.jpeg' — the
# dot file must be converted out-of-band (e.g. `dot -Tjpeg tree.dot -o tree.jpeg`).
Image(filename='tree.jpeg')
Out[14]:
In [17]:
def plot_feature_importances_cancer(model, feature_names=None):
    """Horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``feature_importances_`` (e.g. a RandomForestClassifier).
    feature_names : sequence of str, optional
        Labels for the y axis. Defaults to the global ``X_train``'s columns,
        which is exactly what the original hard-coded version always used.
    """
    if feature_names is None:
        feature_names = X_train.columns
    n_features = len(feature_names)
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel("attr importances")
    plt.ylabel("attr")
    plt.ylim(-1, n_features)

plt.rcParams.update({'font.size': 15})
plot_feature_importances_cancer(forest)
plt.show()  # FIX: originally called before the plotting function, showing nothing
In [20]:
from mpl_toolkits.mplot3d import Axes3D

# True (k-means) labels on the held-out rows...
origin = X_test.copy()
origin['cluster_id'] = y_test
# ...versus the random-forest predictions on the same rows.
test = X_test.copy()
test['cluster_id'] = pred

fig = plt.figure(figsize=(15, 20))

def _scatter3d(position, frame, columns, title):
    """One 3-D scatter panel of three feature columns, coloured by cluster id."""
    ax = fig.add_subplot(position, projection='3d')
    x_col, y_col, z_col = columns
    ax.scatter(frame[x_col], frame[y_col], frame[z_col], c=frame['cluster_id'])
    ax.set_xlabel(x_col)
    ax.set_ylabel(y_col)
    ax.set_zlabel(z_col)
    plt.title(title)

# Three feature triples, each shown side by side as (true labels, predictions);
# replaces six copy-pasted subplot blocks.
column_triples = [
    ("20's_sales_rate", "30's_sales_rate", "40's_sales_rate"),
    ("50's_sales_rate", "60's_sales_rate", "simillar_store_number"),
    ("10's_sales_rate", "50's_sales_rate", "simillar_store_number"),
]
position = 321
for triple in column_triples:
    _scatter3d(position, origin, triple, "original data of test data")
    _scatter3d(position + 1, test, triple, "predict data of test data")
    position += 2

plt.show()
In [ ]: